{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "sys.path.append(os.path.abspath('../../code'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import regex\n",
    "import json\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "np.set_printoptions(formatter={'float': lambda x: \"{0:0.3f}\".format(x)})\n",
    "\n",
    "from utils.io import read_label_config\n",
    "from utils.corpus import DoccanoAnnotationsCorpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from types import SimpleNamespace\n",
    "\n",
    "args = SimpleNamespace()\n",
    "args.data_path = '../../data/annotation/annotations/'\n",
    "args.data_folder_pattern = 'de-manifestos'\n",
    "args.data_annotations_folder = 'annotations'\n",
    "args.data_file_format = 'jsonl'\n",
    "args.keep_annotator = 'emarie,sjasmin'\n",
    "\n",
    "args.label_config_file = '../../data/annotation/doccano_label_config.json'\n",
    "\n",
    "args.output_file = '../../data/annotation/parsed/de-manifestos_annotations.jsonl'\n",
    "args.overwrite_output = False\n",
    "\n",
    "args.verbose = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['../../data/annotation/annotations/de-manifestos-round-01/annotations/emarie.jsonl',\n",
       " '../../data/annotation/annotations/de-manifestos-round-01/annotations/sjasmin.jsonl',\n",
       " '../../data/annotation/annotations/de-manifestos-round-02/annotations/emarie.jsonl',\n",
       " '../../data/annotation/annotations/de-manifestos-round-02/annotations/sjasmin.jsonl']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subdirs = [os.path.join(args.data_path, d, args.data_annotations_folder) for d in os.listdir(args.data_path) if d.startswith(args.data_folder_pattern)]\n",
    "annotators = [a.strip() for a in args.keep_annotator.split(',')]\n",
    "fps = [os.path.join(d, a+'.'+args.data_file_format) for a in annotators for d in subdirs]\n",
    "fps.sort()\n",
    "fps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the label config\n",
    "cat2code = read_label_config(args.label_config_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: annotation of token 13 in document 16ce1c6a9346d2dbd57182cdaaa82852 disambiguated\n"
     ]
    }
   ],
   "source": [
    "# read first (we merge the rest to this one)\n",
    "acorp = DoccanoAnnotationsCorpus(cat2code)\n",
    "annotator = os.path.basename(fps[0]).replace('.jsonl', '')\n",
    "acorp.load_from_jsonlines(fp=fps[0], annotator_id=annotator, verbose=args.verbose)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading annotations from file '../../data/annotation/annotations/de-manifestos-round-01/annotations/sjasmin.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/de-manifestos-round-02/annotations/emarie.jsonl'\n",
      "Reading annotations from file '../../data/annotation/annotations/de-manifestos-round-02/annotations/sjasmin.jsonl'\n"
     ]
    }
   ],
   "source": [
    "# merge remaining ones\n",
    "for fp in fps[1:]:\n",
    "    if args.verbose: print(f'Reading annotations from file \\'{fp}\\'')\n",
    "    tmp = DoccanoAnnotationsCorpus(cat2code)\n",
    "    annotator = os.path.basename(fp).replace('.jsonl', '')\n",
    "    tmp.load_from_jsonlines(fp=fp, annotator_id=annotator, verbose=args.verbose)\n",
    "    acorp.merge_annotated_corpus(tmp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### merge gold labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "fp = os.path.join(args.data_path, 'ra-annotation-de-manifestos-review', 'annotations', 'all.jsonl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the label config\n",
    "cats = ['SG', 'PG', 'PI', 'ORG', 'ISG', 'unsure']\n",
    "cats = [t+c for t in ['I-', 'B-'] for c in cats]\n",
    "cat2code_gold = {l+a: c+1 for a in ['-a', '-z'] for c, l in enumerate(cats)}\n",
    "cat2code_gold['O'] = acorp.outside_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "gold_corp = DoccanoAnnotationsCorpus(cat2code_gold, n_types=len(cats))\n",
    "gold_corp.load_from_jsonlines(fp=fp, annotator_id='GOLD', verbose=args.verbose)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check that doc IDs in gold data match those in the annotations\n",
    "all([doc_id in acorp.doc_ids for doc_id in gold_corp.doc_ids])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge the gold labels to the annotations corpus\n",
    "acorp.merge_gold_corpus(gold_corp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1mf5c4848a1a16a11751deada5456fc2f7\u001b[0m\n",
      "'Die AfD setzt sich für gentechnikfrei erzeugte Lebensmittel aus der deutschen Landwirtschaft ein .'\n",
      " \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m                                                             \u001b[44m         \u001b[49m \u001b[44m              \u001b[49m      \t(emarie)\n",
      " \u001b[44m   \u001b[49m \u001b[44m   \u001b[49m                                                                                           \t(sjasmin)\n",
      " \u001b[42m   \u001b[49m \u001b[42m   \u001b[49m                                                             \u001b[42m         \u001b[49m \u001b[42m              \u001b[49m      \t[GOLD]\n"
     ]
    }
   ],
   "source": [
    "print(acorp.docs[acorp.doc_id2idx[gold_corp.doc_ids[99]]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ensure data integrity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No. docs: 2927\n",
      "(array([1, 2]), array([2328,  599]))\n"
     ]
    }
   ],
   "source": [
    "print('No. docs:', acorp.ndocs)\n",
    "# how many singly/multiply annotated?\n",
    "print(np.unique(np.asarray([doc.n_annotations for doc in acorp.docs]), return_counts=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# identify duplicate texts (if any)\n",
    "texts = Counter()\n",
    "for doc in acorp.docs:\n",
    "    texts.update([doc.text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1, 2]), array([2919,    4]))\n"
     ]
    }
   ],
   "source": [
    "# 4 sentences are verbatim duplicates (possible because we sampled based on within-manifesto sentence IDs)\n",
    "print(np.unique(np.asarray(list(texts.values())), return_counts=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get IDs of documents with dublicated text\n",
    "duplicated = [t for t, n in texts.most_common() if n > 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# map doc IDs to texts\n",
    "duplicates_ids = dict()\n",
    "for doc in acorp.docs:\n",
    "    if doc.text in duplicated:\n",
    "        if doc.text in duplicates_ids.keys():\n",
    "            duplicates_ids[doc.text].append(doc.id)\n",
    "        else:\n",
    "            duplicates_ids[doc.text] = [doc.id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m2eff1ee7b7f8c9eb6ec562cb31ba2d53\u001b[0m\n",
      "'Ein Anspruch auf Teilzeit soll daher nur bestehen , wenn ein Kind betreut oder ein naher Angehöriger gepflegt wird .'\n",
      "                                                              \u001b[44m    \u001b[49m                  \u001b[44m     \u001b[49m \u001b[44m           \u001b[49m                \t(emarie)\n",
      "\u001b[1mce42c6df986b4c553898192a47712969\u001b[0m\n",
      "'Ein Anspruch auf Teilzeit soll daher nur bestehen , wenn ein Kind betreut oder ein naher Angehöriger gepflegt wird .'\n",
      "                                                              \u001b[44m    \u001b[49m                  \u001b[44m     \u001b[49m \u001b[44m           \u001b[49m                \t(emarie)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1ma979d2805a1561dc03903d3c55e74aaf\u001b[0m\n",
      "'Der Selbstbehalt bei der Berechnung von Arbeitslosengeld II ist sanktionsfrei zu erhöhen .'\n",
      "                                                                                           \t(emarie)\n",
      "\u001b[1mb9e57f088a4e9b63538befdeec44a28c\u001b[0m\n",
      "'Der Selbstbehalt bei der Berechnung von Arbeitslosengeld II ist sanktionsfrei zu erhöhen .'\n",
      "                                                                                           \t(emarie)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1me7b3f8d69a3b2fb6db83b9ea2817d90d\u001b[0m\n",
      "'Der überbordende Lobbyismus in Brüssel und Berlin muss eingedämmt werden .'\n",
      "                                                                           \t(emarie)\n",
      "\u001b[1mde426127ca7c4af939382b58c0b63dc9\u001b[0m\n",
      "'Der überbordende Lobbyismus in Brüssel und Berlin muss eingedämmt werden .'\n",
      "                                \u001b[44m       \u001b[49m     \u001b[44m      \u001b[49m                         \t(sjasmin)\n",
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1m62ae8599698ad61f0e6ba84dce622864\u001b[0m\n",
      "'Wir wollen Zivilklauseln in den Gesellschafterverträgen der Durchführungsorganisationen der deutschen Entwicklungszusammenarbeit verankern .'\n",
      "                                                                                                                                             \t(sjasmin)\n",
      "\u001b[1m10fa6bc89c26d903481f191605b21f48\u001b[0m\n",
      "'Wir wollen Zivilklauseln in den Gesellschafterverträgen der Durchführungsorganisationen der deutschen Entwicklungszusammenarbeit verankern .'\n",
      "                                                         \u001b[44m   \u001b[49m \u001b[44m                           \u001b[49m \u001b[44m   \u001b[49m \u001b[44m         \u001b[49m \u001b[44m                          \u001b[49m            \t(sjasmin)\n"
     ]
    }
   ],
   "source": [
    "# print\n",
    "if args.verbose:\n",
    "    for ids in duplicates_ids.values():\n",
    "        print('\\n', '-'*100, sep='')\n",
    "        for id in ids:\n",
    "            print(acorp.docs[acorp.doc_id2idx[id]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# note: I've manually checked the cases where these very duplicate annotations.\n",
    "#  In most cases, the annotations from the same annotator for the same text (though diff. 'docs') are identical.>\n",
    "#  But in the few cases where this does not hold, I manually disambiguate.\n",
    "disambigute_duplicates = {\n",
    "    'e7b3f8d69a3b2fb6db83b9ea2817d90d': ['e7b3f8d69a3b2fb6db83b9ea2817d90d', 'de426127ca7c4af939382b58c0b63dc9'],\n",
    "    '62ae8599698ad61f0e6ba84dce622864': ['62ae8599698ad61f0e6ba84dce622864', '10fa6bc89c26d903481f191605b21f48']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# resolve duplicates: for duplicated texts\n",
    "for ids in duplicates_ids.values():\n",
    "    # see in all but the first doc (the 'original')\n",
    "    if all([id in disambigute_duplicates.values() for id in ids]):\n",
    "        this = [this for this, pair in disambigute_duplicates.items() if all([_ in pair for _ in ids])]\n",
    "        for _ in disambigute_duplicates[this[0]]:\n",
    "            if _ not in this:\n",
    "                acorp.remove_documents([_])\n",
    "    for id in ids[1:]:\n",
    "        # id = ids[1]\n",
    "        # for each annotator\n",
    "        for annotator in acorp.docs[acorp.doc_id2idx[id]].annotators:\n",
    "            # whether the annotator already in the 'original'\n",
    "            if annotator in acorp.docs[acorp.doc_id2idx[ids[0]]].annotators:\n",
    "                # if so remove annotation\n",
    "                acorp.docs[acorp.doc_id2idx[id]].remove_annotation(annotator)\n",
    "        if acorp.docs[acorp.doc_id2idx[id]].n_annotations > 0:\n",
    "            acorp.merge_annotations([ids[0], id])\n",
    "        else:\n",
    "            acorp.remove_documents([id])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1]), array([2923]))\n"
     ]
    }
   ],
   "source": [
    "# verify\n",
    "texts = Counter()\n",
    "for doc in acorp.docs:\n",
    "    texts.update([doc.text])\n",
    "print(np.unique(np.asarray(list(texts.values())), return_counts=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reset important corpus attributes\n",
    "acorp.doc_id2idx = {doc.id: i for i, doc in enumerate(acorp.docs)}\n",
    "acorp.doc_idx2id = {i: doc.id for i, doc in enumerate(acorp.docs)}\n",
    "acorp.annotator_label_counts = acorp._count_annotator_labels()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## clean tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "toks = set()\n",
    "all_chars = Counter()\n",
    "for doc in acorp.docs:\n",
    "    for tok in doc.tokens:\n",
    "        toks.add(tok)\n",
    "        all_chars.update([c for c in tok])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pd\tDash Punctuation\t['–', '-']\n",
      "Pe\tClose Punctuation\t[')']\n",
      "Pf\tFinal Punctuation\t['»', '’']\n",
      "Pi\tInitial Punctuation\t['“', '«']\n",
      "Po\tOther Punctuation\t['.', ',', ':', '!', '\"', '*', '/', '%', ';', '?', '…']\n",
      "Ps\tOpen Punctuation\t['„', '(']\n",
      "Sc\tCurrency Symbol\t['€']\n",
      "Sm\tMath Symbol\t['+']\n"
     ]
    }
   ],
   "source": [
    "from utils.unicode import CATEGORIES as char_cats\n",
    "\n",
    "for k, v in char_cats.items():\n",
    "    regx = r'\\p{'+k+'}'\n",
    "    m = [c for c in all_chars.keys() if regex.match(regx, c)]\n",
    "    if len(m) > 0:\n",
    "        print(k, end='\\t')\n",
    "        print(v, end='\\t')\n",
    "        print(m)\n",
    "# NOTE: no cleaning needed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Write to disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(os.path.dirname(args.output_file), exist_ok=True)\n",
    "if not os.path.exists(args.output_file) or args.overwrite_output:\n",
    "    acorp.save_as_jsonlines(args.output_file, encoding='utf-8')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "group_mention_detection",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
